{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据准备"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "data = pd.read_csv('../utils/dataset/UCI_Zoo_Data_Set/zoo.data.csv', header=None)\n",
    "data=data.drop([0],axis=1)    # 首列是animal_name，丢弃\n",
    "\n",
    "data.columns=list(range(len(data.columns)))    # 将列索引更改成从0开始"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## ID3\n",
    "熵的计算：\n",
    "$$\n",
    "H(D)=-\\sum\\limits_{k=1}^{K}p_{k}{\\log}p_{k}\n",
    "$$"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def entropy(feature):\n",
    "    uni_val, cnt = np.unique(feature, return_counts=True)    # 返回独特值与计数\n",
    "    # 熵的计算\n",
    "    H = np.sum([(-cnt[i]/np.sum(cnt))*np.log2(cnt[i]/np.sum(cnt))\n",
    "                for i in range(len(uni_val))])\n",
    "    return H\n",
    "\n",
    "# entropy(data.iloc[:,1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "信息增益计算：\n",
    "$$\n",
    "H(D|A)=\\sum\\limits_{v}^{V}\\frac{|D_{v}|}{|D|}H(D_{v}) \\\\\n",
    "G(D|A)=H(D)-H(D|A) \\\\\n",
    "$$"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def InfoGain(dataset, f_test_col, Y_col=-1):\n",
    "    entropy_before = entropy(dataset.iloc[:, Y_col])    # 分割前的熵\n",
    "\n",
    "    uni_val, cnt = np.unique(dataset.iloc[:, f_test_col],return_counts=True)    # 计算分割特征的独特值与计数\n",
    "    entropy_cond = np.sum([(cnt[i]/np.sum(cnt))*entropy(dataset.where(dataset.iloc[:, f_test_col]\n",
    "                                                                      == uni_val[i]).dropna().iloc[:, Y_col]) for i in range(len(uni_val))])\n",
    "    return entropy_before-entropy_cond\n",
    "\n",
    "# InfoGain(data,0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "ID3的基本雏形。使用递归来进行分裂，注意边界条件与每轮的特征删除。返回值是一棵以字典形式储存的分类树，叶节点为label。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def ID3(dataset, org_dataset, f_cols, Y_col=-1, p_node_cls=None):\n",
    "    '''\n",
    "    dataset: 用于分割的数据\n",
    "    org_dataset: 用于计算优势类别的数据，父节点数据\n",
    "    f_cols: 备选特征\n",
    "    '''\n",
    "    # 如果数据中的Y已经纯净了，则返回Y的取值\n",
    "    if len(np.unique(dataset.iloc[:, Y_col])) <= 1:\n",
    "        return np.unique(dataset.iloc[:, Y_col])[0]\n",
    "\n",
    "    # 如果传入数据为空(对应空叶节点)，则返回原始数据中数量较多的label值\n",
    "    elif len(dataset) == 0:\n",
    "        uni_cls, cnt = np.unique(\n",
    "            org_dataset.iloc[:, Y_col], return_counts=True)\n",
    "        return uni_cls[np.argmax(cnt)]\n",
    "\n",
    "    # 如果没有特征可用于划分，则返回父节点中数量较多的label值\n",
    "    # 由于初始传入的是Index类型，所以这里不能用if not\n",
    "    elif len(f_cols) == 0:\n",
    "        return p_node_cls\n",
    "\n",
    "    # 否则进行分裂\n",
    "    else:\n",
    "        # 得到当前节点中数量最多的label，递归时会赋给下层函数的p_node_cls\n",
    "        cur_uni_cls, cnt = np.unique(\n",
    "            dataset.iloc[:, Y_col], return_counts=True)\n",
    "        cur_node_cls = cur_uni_cls[np.argmax(cnt)]\n",
    "        del cur_uni_cls, cnt\n",
    "\n",
    "        # 根据信息增益选出最佳分裂特征\n",
    "        gains = [InfoGain(dataset, f_col, Y_col) for f_col in f_cols]\n",
    "        best_f = f_cols[np.argmax(gains)]\n",
    "\n",
    "        # 更新备选特征\n",
    "        f_cols = [col for col in f_cols if col != best_f]\n",
    "\n",
    "        # 按最佳特征的不同取值，划分数据集并递归\n",
    "        tree = {best_f: {}}\n",
    "        for val in np.unique(dataset.iloc[:, best_f]):    # ID3对每一个取值都划分数据集\n",
    "            sub_data = dataset.where(dataset.iloc[:, best_f] == val).dropna()\n",
    "            tree[best_f][val] = ID3(sub_data, dataset, f_cols, Y_col, cur_node_cls)    # 分裂特征的某一取值，对应一颗子树或叶节点\n",
    "\n",
    "        return tree\n",
    "    \n",
    "# ID3(data,data,list(range(len(data.columns)-1)),-1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 生成树是一个字典形式，并且树模型需要从根节点开始判定，预测也可以通过递归实现\n",
    "# 树的根节点通过tree.keys()获取，通过查询键值来得到左右子树的根节点\n",
    "# 为了便于查找，将输入的测试样本也变成字典形式，特征名为key，特征值为val\n",
    "\n",
    "\n",
    "def predict(query, tree, default=-1):\n",
    "    '''\n",
    "    query：一个测试样本，字典形式，{f:val,f:val,...}\n",
    "    tree：生成树\n",
    "    default：查找失败时返回的默认类别\n",
    "    '''\n",
    "    for feature in list(query.keys()):\n",
    "        if feature in list(tree.keys()):    # 如果该特征与根节点的划分特征相同\n",
    "            try:\n",
    "                sub_tree = tree[feature][query[feature]]    # 根据特征的取值来获取子节点\n",
    "\n",
    "                if isinstance(sub_tree, dict):    # 判断是否还有子树\n",
    "                    return predict(query, sub_tree)    # 有则继续查找\n",
    "                else:\n",
    "                    return sub_tree    # 是叶节点则返回结果\n",
    "            except:    # 没有查到则说明是未见过的情况，只能返回default\n",
    "                return default"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "acc:0.8571428571428571\n"
     ]
    }
   ],
   "source": [
    "train_data=data.iloc[:80].reset_index(drop=True)\n",
    "test_data=data.iloc[80:].reset_index(drop=True)\n",
    "\n",
    "# 训练模型\n",
    "tree=ID3(train_data,train_data,list(range(train_data.shape[1]-1)),-1)\n",
    "\n",
    "# DF转dict，一个条目为一个字典，返回一个字典的列表\n",
    "X_test=test_data.iloc[:,:-1].to_dict(orient = \"records\")\n",
    "Y_test=list(test_data.iloc[:,-1])\n",
    "Y_pred=list()\n",
    "\n",
    "for item in X_test:\n",
    "    Y_pred.append(predict(item,tree))\n",
    "    \n",
    "print('acc:{}'.format(np.sum(np.array(Y_test)==np.array(Y_pred))/len(Y_test)))"
   ]
  }
 ],
 "metadata": {
  "hide_input": false,
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
